Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())data_source <- "real"
qpcr_dir <- str_c(get_data_dir(data_source), "03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250416.csv"
cat("We load the file", file, "\n\n")We load the file VIBRANT_qPCR_data_merged_250416.csv
qpcr <- read_csv(str_c(qpcr_dir, file))The data is provided in long format:
qpcr |> glimpse()Rows: 51,480
Columns: 20
$ Data_row <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ Well <chr> "A02", "A03", "A04", "A05", "A06", "A07", "A0…
$ Fluor <chr> "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy…
$ Content <chr> "Unkn-01", "Unkn-01", "Unkn-01", "Unkn-02", "…
$ Replicate <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
$ Sample <chr> "A1", "A1", "A1", "A2", "A2", "A2", "A3", "A3…
$ Cq <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ `Starting Quantity (SQ)` <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ `Cq Mean` <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000…
$ plate_id <chr> "B064152", "B064152", "B064152", "B064152", "…
$ Plate_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ gDNA_Plate_ID <dbl> 2240890, 2240890, 2240890, 2240890, 2240890, …
$ VMRC_Group <chr> "VMRC_3", "VMRC_3", "VMRC_3", "VMRC_3", "VMRC…
$ Group_Fluor <chr> "VMRC_3_Cy5", "VMRC_3_Cy5", "VMRC_3_Cy5", "VM…
$ Target <chr> "185329", "185329", "185329", "185329", "1853…
$ gDNA_Plate_Well <chr> "2240890_A1", "2240890_A1", "2240890_A1", "22…
$ VIBR_Sample_ID <chr> "2152456", "2152456", "2152456", "2157507", "…
$ Dilution_Factor <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3…
$ Quant_Adjusted <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ Copies_per_swab <dbl> 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, …
Data_row is the row numberWell is the well number from A02 to P19Fluor is the fluorescent dye used (Cy5, FAM, HEX)Content ???Replicate is the replicate number from 1 to 96 (with 495 missing values). Values are repeated 495, or 990 timesTODO: check what that is exactly
Sample is a sample ID — ?
Cq is the quantification cycle (Cq) value.
Starting Quantity (SQ) is ???
Cq Mean is the mean Cq value for the replicate ???
TODO: check what that is exactly
plate_id is the plate ID; there are 55 unique plate IDs in total.
Plate_number is the plate number; there are 11 unique plate numbers in total (5 plate_id per Plate_number).
gDNA_Plate_ID is the gDNA plate ID; there are 11 unique gDNA plate IDs in total (match the Plate_number).
VMRC_Group is the VMRC group; there are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5).
Group_Fluor combines the Fluor and VMRC_Group columns
Target is the target “gene” (taxa/strain); there are 15 unique targets in total (122010, 185329, C0006A1, C0022A1, C0028A1, C0059E1, C0112A1, C0175A1, FF00004, FF00018, FF00051, FF00064, FF00072, UC101, UC119).
qpcr |>
dplyr::count(Fluor, VMRC_Group, Group_Fluor, Target)# A tibble: 15 × 5
Fluor VMRC_Group Group_Fluor Target n
<chr> <chr> <chr> <chr> <int>
1 Cy5 VMRC_1 VMRC_1_Cy5 C0175A1 3432
2 Cy5 VMRC_2 VMRC_2_Cy5 UC101 3432
3 Cy5 VMRC_3 VMRC_3_Cy5 185329 3432
4 Cy5 VMRC_4 VMRC_4_Cy5 FF00004 3432
5 Cy5 VMRC_5 VMRC_5_Cy5 UC119 3432
6 FAM VMRC_1 VMRC_1_FAM C0022A1 3432
7 FAM VMRC_2 VMRC_2_FAM FF00018 3432
8 FAM VMRC_3 VMRC_3_FAM C0028A1 3432
9 FAM VMRC_4 VMRC_4_FAM C0006A1 3432
10 FAM VMRC_5 VMRC_5_FAM FF00064 3432
11 HEX VMRC_1 VMRC_1_HEX C0059E1 3432
12 HEX VMRC_2 VMRC_2_HEX FF00051 3432
13 HEX VMRC_3 VMRC_3_HEX C0112A1 3432
14 HEX VMRC_4 VMRC_4_HEX 122010 3432
15 HEX VMRC_5 VMRC_5_HEX FF00072 3432
For now, we only have data for the 15 LBP strains.
gDNA_Plate_Well is the gDNA plate well; there are 1144 unique gDNA plate wells in total in the format [gDNA_Plate_ID]_[Well] (e.g., 2240890_A4).
VIBR_Sample_ID is the VIBRANT sample ID; there are 1016 unique VIBRANT sample IDs in total.
Dilution_Factor is the dilution factor; there are 4 unique dilution factors in total (5, 10, 20, 30).
Quant_Adjusted is the adjusted quantification value; they range from 0 to 3^{8}.
Copies_per_swab is the number of copies per swab; they range from 0 to 7.5^{10}.
We tidy the column names for consistency and readability.
qpcr <- qpcr |> janitor::clean_names()
colnames(qpcr) [1] "data_row" "well" "fluor"
[4] "content" "replicate" "sample"
[7] "cq" "starting_quantity_sq" "cq_mean"
[10] "plate_id" "plate_number" "g_dna_plate_id"
[13] "vmrc_group" "group_fluor" "target"
[16] "g_dna_plate_well" "vibr_sample_id" "dilution_factor"
[19] "quant_adjusted" "copies_per_swab"
We define a sample_type variable to summarize information from vibr_sample_id and sample columns.
qpcr <-
qpcr |>
mutate(
sample_type =
case_when(
str_detect(vibr_sample_id, "#N/A") & str_detect(sample, "std") ~ "Standard",
str_detect(vibr_sample_id, "#N/A") & str_detect(sample, "water") ~ "Water",
str_detect(vibr_sample_id, "EQ") ~ "Control sample",
str_detect(vibr_sample_id, "empty") ~ "Empty",
TRUE ~ "VIBRANT clinical sample"
)
)qpcr |> dplyr::count(sample_type) |> arrange(-n)# A tibble: 5 × 2
sample_type n
<chr> <int>
1 VIBRANT clinical sample 43650
2 Standard 3465
3 Control sample 1980
4 Empty 1890
5 Water 495
tmp <-
qpcr |>
filter(sample_type == "VIBRANT clinical sample", target == target[1]) |>
group_by(vibr_sample_id) |>
summarize(
n_replicates = n(),
n_plates = plate_number |> unique() |> length()
)
# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate
# tmp |> dplyr::count(n_replicates) # All samples have 3 replicatesWe note that all samples’s replicates are on a single plate (plate_number or g_dna_plate_id); and all samples have the same number of replicates (3).
The plate layout is as follows:
qpcr <-
qpcr |>
mutate(well_col = well |> str_sub(1, 1), well_row = well |> str_sub(2,3) |> as.integer()) |>
relocate(well_col, well_row, .after = well)qpcr |>
select(plate_number, well_col, well_row, sample_type, sample) |>
distinct() |>
ggplot() +
aes(x = well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = sample_type) +
geom_tile() +
geom_path(aes(group = sample), col = "black", alpha = 0.5) +
geom_point() +
facet_wrap(~ plate_number, labeller = label_both) +
xlab("Well column") + ylab("Well row") +
labs(caption = "Black lines connect replicates") We also define a unique sample ID according to each sample type.
qpcr <-
qpcr |>
mutate(
sample_id =
case_when(
sample_type == "VIBRANT clinical sample" ~ vibr_sample_id,
sample_type == "Control sample" ~ vibr_sample_id,
sample_type == "Empty" ~ str_c(vibr_sample_id,"_plate_", plate_number |> str_pad(width = 2, pad = 0),"_", sample),
sample_type == "Water" ~ str_c("water_plate", plate_number |> str_pad(width = 2, pad = 0)),
sample_type == "Standard" ~ str_c("std_", content |> str_remove("Std-"), "_plate_", plate_number |> str_pad(width = 2, pad = 0)),
TRUE ~ NA_character_
)
) |>
group_by(sample_id, target) |>
arrange(well_col, well_row) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
mutate(uid = str_c(sample_id, "_r", replicate_nb))The plate x Fluorescence x Target layout is as follows:
qpcr |>
dplyr::count(fluor, target, plate_id, plate_number) |>
arrange(plate_number, plate_id, fluor) |>
mutate(
target = target |> fct_inorder(),
plate_nb = str_c("plate nb: ", plate_number) |> fct_inorder()
) |>
ggplot() +
aes(
x = plate_id,
y = target |> factor() |> fct_rev(),
fill = fluor
) +
geom_tile() +
facet_grid(. ~ plate_nb, scales = "free", space = "free") +
xlab("Plate ID") +
ylab("Target") +
scale_fill_manual(
name = "Fluor.",
values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D")
) +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1)
)Potential batch effects should be checked for plate number (not plate ID).
qpcr |>
dplyr::count(target, dilution_factor) # A tibble: 36 × 3
target dilution_factor n
<chr> <dbl> <int>
1 122010 5 2808
2 122010 10 312
3 122010 20 312
4 185329 20 3120
5 185329 30 312
6 C0006A1 5 2808
7 C0006A1 10 312
8 C0006A1 20 312
9 C0022A1 5 3120
10 C0022A1 10 312
# ℹ 26 more rows
g <-
qpcr |>
ggplot() +
aes(x = target, y = copies_per_swab, col = sample_type, fill = sample_type) +
geom_boxplot(alpha = 0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g g + scale_y_log10()g <-
qpcr |>
ggplot() +
aes(x = target, y = copies_per_swab, col = sample_type, fill = sample_type) +
geom_boxplot(alpha = 0.5) +
facet_grid(sample_type ~ ., scales = "free") +
guides(fill = "none", color = "none") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)
g g + scale_y_log10()g <-
qpcr |>
ggplot() +
aes(x = plate_number |> factor(), y = copies_per_swab, col = target, fill = target) +
geom_boxplot(alpha = 0.5) +
facet_grid(sample_type ~ ., scales = "free") +
xlab("Plate number") +
theme(
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90, hjust = 0)
)
g g + scale_y_log10() From the plot above, it looks like the empty and water samples have non-zero values on a few plates.
plot_qpcr_empties <- function(qpcr, color_by = "sample_type") {
qpcr |>
filter(sample_type %in% c("Empty", "Water")) |>
group_by(sample, plate_id, plate_number, target) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
ggplot() +
aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
geom_text(size = 3) +
facet_grid(target ~ plate_number, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)
}plot_qpcr_empties(qpcr, color_by = "sample_type") plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") prob_target <-
qpcr |>
select(target) |>
distinct() |>
mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))We may have to check the values for targets C0175A1, C0022A1, C0059E1.
prob_target <-
prob_target |>
mutate(prob_target = if_else(prob_target, "x", "v"))
qpcr <-
qpcr |>
left_join(prob_target, by = "target") qpcr |>
filter(sample_type %in% c("Control sample")) |>
group_by(sample, plate_id, plate_number, target) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
ggplot() +
aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
geom_text(size = 3) +
scale_y_log10() +
scale_color_discrete("replicate") +
facet_grid(prob_target + target ~ plate_number, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
) From these plots, I assume that some of these controls are negative controls and others are positive controls.
TODO: check if they match the sampleID from the metagenomics data or if we need a manifest.
Replicability looks good, but we note the same issues with the problematic targets in the (likely) negative controls as in the empty samples.
TODO
qpcr |>
filter(sample_type %in% c("VIBRANT clinical sample")) |>
group_by(vibr_sample_id, sample, plate_id, plate_number, target) |>
mutate(
replicate_nb = row_number(),
median_cps = median(copies_per_swab, na.rm = TRUE),
mean_cps = mean(copies_per_swab, na.rm = TRUE)
) |>
ungroup() |>
mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
ggplot() +
aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
geom_text(size = 3) +
scale_y_log10() +
scale_color_discrete("replicate") +
scale_x_discrete("Samples", breaks = NULL) +
facet_grid(prob_target + target ~ plate_number, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)We notice the same issues with the problematic targets in plate 1-3 (bottom, left).
These three targets are from the same plates (plate_id column):
qpcr |>
filter(prob_target == "x", plate_number %in% 1:3) |>
select(target, fluor, plate_id, plate_number) |>
distinct() |>
gt()| target | fluor | plate_id | plate_number |
|---|---|---|---|
| C0175A1 | Cy5 | C115281 | 1 |
| C0022A1 | FAM | C115281 | 1 |
| C0059E1 | HEX | C115281 | 1 |
| C0175A1 | Cy5 | C115282 | 2 |
| C0022A1 | FAM | C115282 | 2 |
| C0059E1 | HEX | C115282 | 2 |
| C0175A1 | Cy5 | C115283 | 3 |
| C0022A1 | FAM | C115283 | 3 |
| C0059E1 | HEX | C115283 | 3 |
qpcr |>
filter(prob_target == "x", plate_number %in% 1:3) |>
select(target, fluor, plate_id, plate_number, starts_with("well"), sample, vibr_sample_id, sample_type, copies_per_swab) |>
arrange(sample_type) |>
ggplot() +
aes(x = well_row |> factor(), y = well_col |> fct_rev(), fill = sample_type) +
geom_tile() +
geom_point(aes(size = copies_per_swab |> log10())) +
geom_line(aes(group = sample), col = "black", alpha = 0.5) +
facet_grid(target ~ plate_number) +
coord_fixed() +
xlab("Well column") + ylab("Well row") +
labs(caption = "Black lines connect replicates") There isn’t a clear “geometrical” pattern on the plate. Let’s just assume that these three plates failed.
We could simply discard these values (exclude these targets x plates) or take the minimum when aggregating across replicates (probably not a good idea for the remaining good plates).
SummarizedExperiment objectsWe create two Summarized Experiment objects:
one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa), with the following assays
cq)starting_quantity_sqcq_meanquant_adjustedcopies_per_swab_raw (current copies_per_swab column)copies_per_swab (where “problematic” targets x plates are set to NA)The colData contains the following columns:
+ `uid`
+ `sample_id`
+ `sample`
+ `vibr_sample_id`
+ `sample_type`
+ `plate_number`
+ `g_dna_plate_id`
+ (`dilution_factor`)
The rowData contains the following columns:
+ `fluor`
+ `plate_ids` (the concatenation of `plate_id` for each `plate_number`)
+ `valid_plates` (the plate_number where signal looked good)one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays
copies_per_swab (the median copies_per_swab across replicates)copies_per_swab_cv (the coefficient of variation of the copies_per_swab across replicates)copies_per_swab_mean (the mean copies_per_swab across replicates)The colData contains the following columns:
+ `sample_type`
+ `vibr_sample_id`
+ `sample`
+ `plate_number`
+ `g_dna_plate_id`
+ `dilution_factor`
The rowData contains the following columns:
+ `fluor`
+ `vmrc_group`
+ `plate_ids` (the concatenation of `plate_id` for each `plate_number`)
+ `valid_plates` (the plate_number where signal looked good)SummarizedExperiment objectmake_raw_qpcr_SE <- function(qpcr){
qpcr <-
qpcr |>
arrange(plate_number, well_col, well_row) |>
mutate(uid = uid |> fct_inorder()) |>
arrange(plate_number, plate_id, fluor) |>
mutate(target = target |> fct_inorder()) |>
arrange(uid, target)
cq_assay <-
qpcr |>
select(uid, target, cq) |>
arrange() |>
pivot_wider(names_from = uid, values_from = cq) |>
as.data.frame() |>
column_to_rownames("target")
starting_quantity_sq_assay <-
qpcr |>
select(uid, target, starting_quantity_sq) |>
arrange() |>
pivot_wider(names_from = uid, values_from = starting_quantity_sq) |>
as.data.frame() |>
column_to_rownames("target")
cq_mean_assay <-
qpcr |>
select(uid, target, cq_mean) |>
arrange() |>
pivot_wider(names_from = uid, values_from = cq_mean) |>
as.data.frame() |>
column_to_rownames("target")
quant_adjusted_assay <-
qpcr |>
select(uid, target, quant_adjusted) |>
arrange() |>
pivot_wider(names_from = uid, values_from = quant_adjusted) |>
as.data.frame() |>
column_to_rownames("target")
copies_per_swab_raw_assay <-
qpcr |>
select(uid, target, copies_per_swab) |>
arrange() |>
pivot_wider(names_from = uid, values_from = copies_per_swab) |>
as.data.frame() |>
column_to_rownames("target")
coldata <-
qpcr |>
select(uid, sample_id, sample, vibr_sample_id, sample_type, plate_number, g_dna_plate_id) |>
distinct() |>
arrange(uid) |>
as.data.frame() |>
mutate(rownames = uid) |>
column_to_rownames("rownames")
}
SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)TO FINISH
SummarizedExperiment objectTO DO
SummarizedExperiment objectsSave the SE objects to disk